source("./04_simulation_00_setup.R")

## Generate Data

load("./generated/data/recoded_broockman_kalla_data.RData")

## limit sample data to where t1 index is not missing
sample <- sample %>% filter(!is.na(trans.tolerance.dv.t1))

set.seed(39861)

# Covariates used for projection models in main analysis
covariates <- c('vf_female', 'vf_black', 'vf_white', 'vf_age',
                names(sample)[str_detect(names(sample), "ideology_t0_factor|religious_t0_factor|pid_t0_factor")][-c(1:3)])

## OLS Truth
## estimate linear model
lin_Y0 <- lm( as.formula(paste0("trans.tolerance.dv.t1 ~ ", paste0(covariates, collapse = " + "))),
              data = sample %>% filter(treat_ind == 0))
lin_Y1 <- lm( as.formula(paste0("trans.tolerance.dv.t1 ~ ", paste0(covariates, collapse = " + "))),
              data = sample %>% filter(treat_ind == 1))


lin_y0_sd <- sd(lin_Y0$residuals)
lin_y1_sd <- sd(lin_Y1$residuals)

## predict on to expt to do scaling
pop$Y0_linear <- predict(lin_Y0, newdata = pop)
pop$Y1_linear <- predict(lin_Y1, newdata = pop)
sample$Y0_linear <- predict(lin_Y0, newdata = sample)
sample$Y1_linear <- predict(lin_Y1, newdata = sample)

## do scaling to effect of 1
tau_linear_pop <- pop$Y1_linear - pop$Y0_linear
tau_linear_samp <- sample$Y1_linear - sample$Y0_linear
sample$Y0_linear <- sample$Y0_linear + rnorm(nrow(sample), 0, lin_y0_sd)
sample$Y1_linear <- sample$Y0_linear + tau_linear_samp * 1/mean(tau_linear_pop) + rnorm(nrow(sample), 0, lin_y1_sd)
sample$Y_linear <- ifelse(sample$treat_ind, sample$Y1_linear, sample$Y0_linear)

## re-estimate to get new coefs
lin_Y0 <- lm( as.formula(paste0("Y_linear ~ ", paste0(covariates, collapse = " + "))),
              data = sample %>% filter(treat_ind == 0))
lin_Y1 <- lm( as.formula(paste0("Y_linear ~ ", paste0(covariates, collapse = " + "))),
              data = sample %>% filter(treat_ind == 1))


## re-predict on to pop
pop$Y0_linear <- predict(lin_Y0, newdata = pop)
pop$Y1_linear <- predict(lin_Y1, newdata = pop)

## Check true pop effect
mean(pop$Y1_linear - pop$Y0_linear)


## BART Truth
bart_mod <- bartc(response = sample$trans.tolerance.dv.t1,
                  treatment = sample$treat_ind,
                  confounders = as.matrix(sample[, covariates]),
                  method.trt = "none", keepTrees = TRUE)

# make prediction chunks
pop$chunk = as.numeric(cut(1:nrow(pop), nrow(pop) %/% 1000))

pop$Y0_bart <- lapply(unique(pop$chunk), function(chunk) {
  colMeans(predict(bart_mod,
                   newdata = pop[pop$chunk == chunk, covariates],
                   type = "y.0"))
}) %>% unlist()

pop$Y1_bart <- lapply(unique(pop$chunk), function(chunk) {
  colMeans(predict(bart_mod,
                   newdata = pop[pop$chunk == chunk, covariates],
                   type = "y.1"))
}) %>% unlist()

bart_y0_sd <- sd(pop$Y0_bart)
bart_y1_sd <- sd(pop$Y1_bart)

sample$Y0_bart <- colMeans(predict(bart_mod,
                                   newdata = sample[, covariates],
                                   type = "y.0"))

sample$Y1_bart <- colMeans(predict(bart_mod,
                                   newdata = sample[, covariates],
                                   type = "y.1"))



## do scaling to 1 sd effect
tau_bart_pop <- pop$Y1_bart - pop$Y0_bart
tau_bart_samp <- sample$Y1_bart - sample$Y0_bart
sample$Y0_bart <- sample$Y0_bart + rnorm(nrow(sample), 0, bart_y0_sd)
sample$Y1_bart <- sample$Y0_bart + tau_bart_samp * 1/mean(tau_bart_pop) + rnorm(nrow(sample), 0, bart_y1_sd)
sample$Y_bart <- ifelse(sample$treat_ind, sample$Y1_bart, sample$Y0_bart)

## re-estimate to get new coefs
bart_mod <- bartc(response = sample$Y_bart,
                  treatment = sample$treat_ind,
                  confounders = as.matrix(sample[, covariates]),
                  method.trt = "none", keepTrees = TRUE)

## re-predict on to pop
pop$Y0_bart <- lapply(unique(pop$chunk), function(chunk) {
  colMeans(predict(bart_mod,
                   newdata = pop[pop$chunk == chunk, covariates],
                   type = "y.0"))
}) %>% unlist()

pop$Y1_bart <- lapply(unique(pop$chunk), function(chunk) {
  colMeans(predict(bart_mod,
                   newdata = pop[pop$chunk == chunk, covariates],
                   type = "y.1"))
}) %>% unlist()

## Check true pop effect
mean(pop$Y1_bart - pop$Y0_bart)

## choose the smaller of the errors so that the simulations
## have the same noise
y0_sd <- min(lin_y0_sd, bart_y0_sd)
y1_sd <- min(lin_y1_sd * 1/mean(tau_linear_pop), bart_y1_sd * 1/mean(tau_bart_pop))

## update experimental outcome
sample$Y0_linear <- predict(lin_Y0, newdata = sample)
sample$Y1_linear <- predict(lin_Y1, newdata = sample)

mean(sample$Y1_linear - sample$Y0_linear)

sample$Y0_bart <- colMeans(predict(bart_mod,
                                   newdata = sample[, covariates],
                                   type = "y.0"))

sample$Y1_bart <- colMeans(predict(bart_mod,
                                   newdata = sample[, covariates],
                                   type = "y.1"))

mean(sample$Y1_bart - sample$Y0_bart)

pop$tau_linear <- pop$Y1_linear - pop$Y0_linear
pop$S_linear <- rbinom(nrow(pop), size = 1, prob = ifelse(pop$tau_linear < quantile(pop$tau_linear, probs = c(0.75)), 0.035, 0.01))

selection_model <- glm(as.formula(paste0("S_linear ~ ", paste0(covariates, collapse = " + "))), data = pop, family = binomial(link = "logit"))

expt_include_linear = predict(selection_model, newdata = pop, type = "response")

pop$tau_bart <- pop$Y1_bart - pop$Y0_bart
pop$S_bart <- rbinom(nrow(pop), size = 1, prob = ifelse(pop$tau_bart < quantile(pop$tau_bart, probs = c(0.05)), 0.95, 0.05))

selection_model <- glm(as.formula(paste0("S_bart ~ ", paste0(covariates, collapse = " + "))), data = pop, family = binomial(link = "logit"))

expt_include_bart = predict(selection_model, newdata = pop, type = "response")

save(pop, expt_include_bart, expt_include_linear, covariates, y0_sd, y1_sd, file = "./generated/appendix_2/naturalistic_sim_data.RData")